{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "from functools import partial\n",
    "import warnings\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tqdm import tqdm_notebook as tqdm\n",
    "from sklearn.externals import joblib\n",
    "%matplotlib inline\n",
    "import seaborn as sns\n",
    "\n",
    "sys.path.append('../')\n",
    "from src.utils import parallel_apply\n",
    "from src.feature_extraction import add_features_in_group\n",
    "\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "DIR = '/PATH/TO/YOUR/DATA'\n",
    "description = pd.read_csv(os.path.join(DIR,'data/HomeCredit_columns_description.csv'),encoding = 'latin1')\n",
    "application = pd.read_csv(os.path.join(DIR, 'files/unzipped_data/application_train.csv'))\n",
    "bureau = pd.read_csv(os.path.join(DIR, 'files/unzipped_data/bureau.csv'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "bureau.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Preprocessing\n",
    "## Solution 3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "(bureau['AMT_CREDIT_SUM'] == 0).sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This shows that imputing with nan with 0 is probably a bad idea"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Feature Engineering\n",
    "## Solution 3\n",
    "### Hand crafted features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "bureau[bureau['SK_ID_CURR']==215354]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### First build helper columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "bureau['bureau_credit_active_binary'] = (bureau['CREDIT_ACTIVE'] != 'Closed').astype(int)\n",
    "bureau['bureau_credit_enddate_binary'] = (bureau['DAYS_CREDIT_ENDDATE'] > 0).astype(int)\n",
    "groupby_SK_ID_CURR = bureau.groupby(by=['SK_ID_CURR'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "bureau.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "description[description['Row'] == 'DAYS_CREDIT'].Description.tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "features = pd.DataFrame({'SK_ID_CURR':bureau['SK_ID_CURR'].unique()})\n",
    "features.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "group_object = groupby_SK_ID_CURR['DAYS_CREDIT'].agg('count').reset_index()\n",
    "group_object.rename(index=str, columns={'DAYS_CREDIT': 'bureau_number_of_past_loans'},inplace=True)\n",
    "\n",
    "features = features.merge(group_object, on=['SK_ID_CURR'], how='left')\n",
    "features.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "group_object = groupby_SK_ID_CURR['CREDIT_TYPE'].agg('nunique').reset_index()\n",
    "group_object.rename(index=str, columns={'CREDIT_TYPE': 'bureau_number_of_loan_types'},inplace=True)\n",
    "\n",
    "features = features.merge(group_object, on=['SK_ID_CURR'], how='left')\n",
    "features.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "features['bureau_average_of_past_loans_per_type'] = \\\n",
    "    features['bureau_number_of_past_loans'] / features['bureau_number_of_loan_types']\n",
    "    \n",
    "features.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "group_object = groupby_SK_ID_CURR['bureau_credit_active_binary'].agg('mean').reset_index()\n",
    "\n",
    "features = features.merge(group_object, on=['SK_ID_CURR'], how='left')\n",
    "features.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "group_object = groupby_SK_ID_CURR['AMT_CREDIT_SUM_DEBT'].agg('sum').reset_index()\n",
    "group_object.rename(index=str, columns={'AMT_CREDIT_SUM_DEBT': 'bureau_total_customer_debt'},inplace=True)\n",
    "\n",
    "features = features.merge(group_object, on=['SK_ID_CURR'], how='left')\n",
    "features.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "group_object = groupby_SK_ID_CURR['AMT_CREDIT_SUM'].agg('sum').reset_index()\n",
    "group_object.rename(index=str, columns={'AMT_CREDIT_SUM': 'bureau_total_customer_credit'},inplace=True)\n",
    "\n",
    "features = features.merge(group_object, on=['SK_ID_CURR'], how='left')\n",
    "features.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "features['bureau_debt_credit_ratio'] = \\\n",
    "    features['bureau_total_customer_debt'] / features['bureau_total_customer_credit']\n",
    "    \n",
    "features.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "group_object = groupby_SK_ID_CURR['AMT_CREDIT_SUM_OVERDUE'].agg('sum').reset_index()\n",
    "group_object.rename(index=str, columns={'AMT_CREDIT_SUM_OVERDUE': 'bureau_total_customer_overdue'},inplace=True)\n",
    "\n",
    "features = features.merge(group_object, on=['SK_ID_CURR'], how='left')\n",
    "features.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "features['bureau_overdue_debt_ratio'] = \\\n",
    "    features['bureau_total_customer_overdue'] / features['bureau_total_customer_debt']\n",
    "    \n",
    "features.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "group_object = groupby_SK_ID_CURR['CNT_CREDIT_PROLONG'].agg('sum').reset_index()\n",
    "group_object.rename(index=str, columns={'CNT_CREDIT_PROLONG': 'bureau_total_prolonged_count'},inplace=True)\n",
    "\n",
    "features = features.merge(group_object, on=['SK_ID_CURR'], how='left')\n",
    "features.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "group_object = groupby_SK_ID_CURR['bureau_credit_enddate_binary'].agg('mean').reset_index()\n",
    "group_object.rename(index=str, columns={'bureau_credit_enddate_binary': 'bureau_credit_enddate_percentage'},inplace=True)\n",
    "\n",
    "features = features.merge(group_object, on=['SK_ID_CURR'], how='left')\n",
    "features.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = application.merge(features,\n",
    "                                left_on=['SK_ID_CURR'],\n",
    "                                right_on=['SK_ID_CURR'],\n",
    "                                how='left',\n",
    "                                validate='one_to_one')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "engineered_numerical_columns = list(features.columns)\n",
    "engineered_numerical_columns.remove('SK_ID_CURR')\n",
    "X = X[engineered_numerical_columns + ['TARGET']]\n",
    "X_corr = abs(X.corr())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_corr.sort_values('TARGET', ascending=False)['TARGET']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.heatmap(X_corr, \n",
    "            xticklabels=X_corr.columns,\n",
    "            yticklabels=X_corr.columns)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Aggregations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "BUREAU_AGGREGATION_RECIPIES = [('CREDIT_TYPE', 'count'),\n",
    "                               ('CREDIT_ACTIVE', 'size')\n",
    "                               ]\n",
    "for agg in ['mean', 'min', 'max', 'sum', 'var']:\n",
    "    for select in ['AMT_ANNUITY',\n",
    "                   'AMT_CREDIT_SUM',\n",
    "                   'AMT_CREDIT_SUM_DEBT',\n",
    "                   'AMT_CREDIT_SUM_LIMIT',\n",
    "                   'AMT_CREDIT_SUM_OVERDUE',\n",
    "                   'AMT_CREDIT_MAX_OVERDUE',\n",
    "                   'CNT_CREDIT_PROLONG',\n",
    "                   'CREDIT_DAY_OVERDUE',\n",
    "                   'DAYS_CREDIT',\n",
    "                   'DAYS_CREDIT_ENDDATE',\n",
    "                   'DAYS_CREDIT_UPDATE'\n",
    "                   ]:\n",
    "        BUREAU_AGGREGATION_RECIPIES.append((select, agg))\n",
    "BUREAU_AGGREGATION_RECIPIES = [(['SK_ID_CURR'], BUREAU_AGGREGATION_RECIPIES)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "groupby_aggregate_names = []\n",
    "for groupby_cols, specs in tqdm(BUREAU_AGGREGATION_RECIPIES):\n",
    "    group_object = bureau.groupby(groupby_cols)\n",
    "    for select, agg in tqdm(specs):\n",
    "        groupby_aggregate_name = '{}_{}_{}'.format('_'.join(groupby_cols), agg, select)\n",
    "        application = application.merge(group_object[select]\n",
    "                              .agg(agg)\n",
    "                              .reset_index()\n",
    "                              .rename(index=str,\n",
    "                                      columns={select: groupby_aggregate_name})\n",
    "                              [groupby_cols + [groupby_aggregate_name]],\n",
    "                              on=groupby_cols,\n",
    "                              how='left')\n",
    "        groupby_aggregate_names.append(groupby_aggregate_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "application.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "application_agg = application[groupby_aggregate_names + ['TARGET']]\n",
    "application_agg_corr = abs(application_agg.corr())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "application_agg_corr.sort_values('TARGET', ascending=False)['TARGET']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Solution 6\n",
    "## Hand Crafted Features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "bureau['bureau_credit_type_consumer'] = (bureau['CREDIT_TYPE'] == 'Consumer credit').astype(int)\n",
    "bureau['bureau_credit_type_car'] = (bureau['CREDIT_TYPE'] == 'Car loan').astype(int)\n",
    "bureau['bureau_credit_type_mortgage'] = (bureau['CREDIT_TYPE'] == 'Mortgage').astype(int)\n",
    "bureau['bureau_credit_type_credit_card'] = (bureau['CREDIT_TYPE'] == 'Credit card').astype(int)\n",
    "bureau['bureau_credit_type_other'] = (~(bureau['CREDIT_TYPE'].isin(['Consumer credit',\n",
    "                                                        'Car loan', 'Mortgage', 'Credit card']))).astype(int)\n",
    "bureau['bureau_unusual_currency'] = (~(bureau['CREDIT_CURRENCY'] == 'currency 1')).astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "groupby_SK_ID_CURR = bureau.groupby(by=['SK_ID_CURR'])\n",
    "group_object = groupby_SK_ID_CURR[['bureau_credit_type_consumer',\n",
    "                                   'bureau_credit_type_car',\n",
    "                                   'bureau_credit_type_mortgage',\n",
    "                                   'bureau_credit_type_credit_card',\n",
    "                                   'bureau_credit_type_other',\n",
    "                                   'bureau_unusual_currency']].agg(['sum', 'mean']).reset_index()\n",
    "\n",
    "group_object.columns = ['{}_{}'.format(a,b) for a,b in group_object.columns.tolist()]\n",
    "group_object.rename(index=str, columns={'SK_ID_CURR_': 'SK_ID_CURR'},inplace=True)\n",
    "features = features.merge(group_object, on=['SK_ID_CURR'], how='left')\n",
    "features.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "COLUMN_NAME = 'DAYS_CREDIT_UPDATE'\n",
    "d = description[description['Row']==COLUMN_NAME]\n",
    "display(d.Description.tolist())\n",
    "display(d.Special.tolist())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "bureau.sort_values(['SK_ID_CURR','DAYS_CREDIT'],ascending=False, inplace=True)\n",
    "bureau['days_credit_diff'] = bureau['DAYS_CREDIT'].diff().replace(np.nan, 0)    \n",
    "\n",
    "groupby_SK_ID_CURR = bureau.groupby(by=['SK_ID_CURR'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "group_object = groupby_SK_ID_CURR['bureau_credit_active_binary'].agg('sum').reset_index()\n",
    "group_object.rename(index=str, columns={'bureau_credit_active_count': 'bureau_credit_active_binary'},inplace=True)\n",
    "\n",
    "features = features.merge(group_object, on=['SK_ID_CURR'], how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "group_object = groupby_SK_ID_CURR['SK_ID_BUREAU'].agg('nunique').reset_index()\n",
    "group_object.rename(index=str, columns={'SK_ID_BUREAU': 'bureau_query_count'},inplace=True)\n",
    "\n",
    "features = features.merge(group_object, on=['SK_ID_CURR'], how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def last_k_bureau_features(gr, periods):\n",
    "    gr_ = gr.copy()\n",
    "\n",
    "    features = {}\n",
    "    for period in periods:\n",
    "        if period > 10e10:\n",
    "            period_name = 'all_records_'\n",
    "            gr_period = gr_.copy()\n",
    "        else:\n",
    "            period_name = 'last_{}_'.format(period)\n",
    "            gr_period = gr_[gr_['DAYS_CREDIT'] >= (-1) * period]\n",
    "\n",
    "        features = add_features_in_group(features, gr_period, 'days_credit_diff',\n",
    "                                             ['sum', 'min', 'max', 'median', 'std'],\n",
    "                                        period_name)\n",
    "        features = add_features_in_group(features, gr_period, 'CNT_CREDIT_PROLONG',\n",
    "                                             ['sum', 'std'],\n",
    "                                        period_name)\n",
    "        features = add_features_in_group(features, gr_period, 'bureau_credit_active_binary',\n",
    "                                             ['sum', 'mean'],\n",
    "                                        period_name)\n",
    "        features = add_features_in_group(features, gr_period, 'bureau_credit_type_consumer',\n",
    "                                             ['sum', 'mean'],\n",
    "                                        period_name)\n",
    "        features = add_features_in_group(features, gr_period, 'bureau_credit_type_car',\n",
    "                                             ['sum', 'mean'],\n",
    "                                        period_name)\n",
    "        features = add_features_in_group(features, gr_period, 'bureau_credit_type_credit_card',\n",
    "                                             ['sum'],\n",
    "                                        period_name)\n",
    "        features = add_features_in_group(features, gr_period, 'bureau_credit_type_mortgage',\n",
    "                                             ['sum'],\n",
    "                                        period_name)\n",
    "        features = add_features_in_group(features, gr_period, 'bureau_credit_type_other',\n",
    "                                             ['sum', 'mean'],\n",
    "                                        period_name)\n",
    "    return features\n",
    "\n",
    "\n",
    "def trend_in_last_k_bureau_features(gr, periods):\n",
    "    gr_ = gr.copy()\n",
    "    gr_['days_credit_diff'].iloc[0] = 0\n",
    "\n",
    "    features = {}\n",
    "\n",
    "    for period in periods:\n",
    "        gr_period = gr_[gr_['DAYS_CREDIT'] >= (-1) * period]\n",
    "        features = _add_trend_feature(features,gr_period,\n",
    "                                      'days_credit_diff','{}_period_trend_'.format(period)\n",
    "                                     )\n",
    "    return features\n",
    "\n",
    "\n",
    "def _add_trend_feature(features, gr ,feature_name, prefix):\n",
    "    y = gr[feature_name].values\n",
    "    try:\n",
    "        x = np.arange(0,len(y)).reshape(-1,1)\n",
    "        lr = LinearRegression()\n",
    "        lr.fit(x,y)\n",
    "        trend = lr.coef_[0]\n",
    "    except:\n",
    "        trend=np.nan\n",
    "    features['{}{}'.format(prefix,feature_name)] = trend\n",
    "    return features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "func = partial(last_k_bureau_features, periods=[60, 180, 360, 720, 1e25])\n",
    "\n",
    "g = parallel_apply(groupby_SK_ID_CURR, func, index_name='SK_ID_CURR',\n",
    "                   num_workers=10, chunk_size=10000).reset_index()\n",
    "features = features.merge(g, on='SK_ID_CURR', how='left')\n",
    "\n",
    "display(features.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "features.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = application.merge(features,\n",
    "                      left_on=['SK_ID_CURR'],\n",
    "                      right_on=['SK_ID_CURR'],\n",
    "                      how='left',\n",
    "                      validate='one_to_one')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "feature_names = list(features.columns)\n",
    "feature_names.remove('SK_ID_CURR')\n",
    "X = X[feature_names + ['TARGET']]\n",
    "X_abs = abs(X.corr())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "X_abs.sort_values('TARGET', ascending=False)['TARGET']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "regex = 'credit_active'\n",
    "X_corr_truncated = X_abs.sort_values('TARGET', ascending=False).filter(regex=regex, axis=0)\n",
    "X_corr_truncated['TARGET']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "colnames = X_corr_truncated.index.tolist() + ['TARGET']\n",
    "X_corr_truncated[colnames]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.heatmap(X_corr_truncated[colnames], \n",
    "            xticklabels=colnames,\n",
    "            yticklabels=colnames)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
